/* This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.

 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.

 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

/* Scaler (except Scale2x) code borrowed from ScummVM project */

#include "sysconfig.h"
#include "gfxfilter.h"

// extern int cpu_mmx;

// extern "C"
// {

/*
 * Bit masks describing the active 16-bit pixel layout. They are filled in by
 * Init_2xSaI(). The first four masks repeat their 16-bit pattern in both
 * halves of the 32-bit value.
 */
static uint colorMask;     // all channel bits except the lowest bit of each channel (used for /2 blends)
static uint lowPixelMask;  // only the lowest bit of each channel
static uint qcolorMask;    // all channel bits except the two lowest bits of each channel (used for /4 blends)
static uint qlowpixelMask; // only the two lowest bits of each channel
static uint redblueMask;   // red and blue channel bits of one 16-bit pixel
static uint redMask;       // red channel bits of one 16-bit pixel
static uint greenMask;     // green channel bits of one 16-bit pixel
static uint blueMask;      // blue channel bits of one 16-bit pixel

// Indexed by a 16-bit pixel value; the *_32 scalers in this file read it to
// produce their output pixels. It is not filled anywhere in this part of the file.
uint LUT16to32[65536];
// Not referenced in this part of the file; presumably a 16-bit RGB -> YUV
// table used by other filters - TODO confirm.
uint RGBtoYUV[65536];

/*
 * Select the bit masks used by the scalers in this file.
 *
 * rb, gb, bb: number of bits in the red, green and blue channels. Only their
 *             sum is examined: 16 selects the 5-6-5 masks, 15 selects the
 *             5-5-5 masks.
 * rs, gs, bs: not used by this function (presumably the channel shifts).
 *
 * Returns 1 after the masks have been set, or 0 when the channel widths add
 * up to neither 15 nor 16 bits; in that case the masks are left untouched.
 */
int Init_2xSaI(int rb, int gb, int bb, int rs, int gs, int bs)
{
    const int totalBits = rb + gb + bb;
    const int is565 = (totalBits == 16);

    if (!is565 && totalBits != 15)
    {
        return 0;
    }

    // Masks that operate on two packed 16-bit pixels at once.
    colorMask = is565 ? 0xF7DEF7DE : 0x7BDE7BDE;
    lowPixelMask = is565 ? 0x08210821 : 0x04210421;
    qcolorMask = is565 ? 0xE79CE79C : 0x739C739C;
    qlowpixelMask = is565 ? 0x18631863 : 0x0C630C63;

    // Per-channel masks for a single 16-bit pixel.
    redblueMask = is565 ? 0xF81F : 0x7C1F;
    redMask = is565 ? 0xF800 : 0x7C00;
    greenMask = is565 ? 0x07E0 : 0x03E0;
    blueMask = 0x001F;

    return 1;
}

/*
 * Vote between colours A and B using two neighbouring pixels C and D.
 *
 * Each neighbour counts as a match for A when it equals A; otherwise it
 * counts as a match for B when it equals B.
 *
 * Returns -1 when both neighbours match A, +1 when both neighbours match B,
 * and 0 in every other case.
 */
static _inline int GetResult(uint A, uint B, uint C, uint D)
{
    // Indexed as verdict[matches for B][matches for A].
    static const int verdict[3][3] = {
        { 0, 0, -1 },
        { 0, 0, -1 },
        { 1, 1, 0 }
    };
    int matchesA = 0;
    int matchesB = 0;

    if (A == C)
        matchesA++;
    else if (B == C)
        matchesB++;

    if (A == D)
        matchesA++;
    else if (B == D)
        matchesB++;

    return verdict[matchesB][matchesA];
}

/*
 * Blend two pixels 50/50, channel by channel, using the masks selected by
 * Init_2xSaI(). Identical inputs are returned unchanged.
 */
static _inline uint INTERPOLATE(uint A, uint B)
{
    uint halfA;
    uint halfB;
    uint sharedLowBits;

    if (A == B)
    {
        return A;
    }

    // Drop the lowest bit of every channel before halving so that nothing
    // shifts across a channel boundary.
    halfA = (A & colorMask) >> 1;
    halfB = (B & colorMask) >> 1;

    // When both inputs have a channel's lowest bit set, the two dropped
    // halves add up to one unit in that channel.
    sharedLowBits = A & B & lowPixelMask;

    return halfA + halfB + sharedLowBits;
}

/*
 * Average four pixels channel by channel, using the masks selected by
 * Init_2xSaI(). Passing the same pixel several times weights the result
 * towards it.
 */
static _inline uint Q_INTERPOLATE(uint A, uint B, uint C, uint D)
{
    // Quarter of each pixel with the two lowest bits of every channel
    // removed first, so nothing shifts across a channel boundary.
    const uint highPart = ((A & qcolorMask) >> 2)
                        + ((B & qcolorMask) >> 2)
                        + ((C & qcolorMask) >> 2)
                        + ((D & qcolorMask) >> 2);

    // Sum of the removed low bits, divided by four and clipped back to the
    // two low bit positions of each channel.
    uint lowPart = (A & qlowpixelMask)
                 + (B & qlowpixelMask)
                 + (C & qlowpixelMask)
                 + (D & qlowpixelMask);

    lowPart = (lowPart >> 2) & qlowpixelMask;

    return highPart + lowPart;
}

/*
 * Super2xSaI scaler, 16-bit source to 16-bit destination.
 *
 * Every source pixel is expanded into a 2x2 block of destination pixels.
 *
 * srcPtr:   first source pixel to scale; pixels are read as ushort.
 * srcPitch: distance between source rows in bytes.
 * dstPtr:   first destination pixel; pixels are written as ushort.
 * dstPitch: distance between destination rows in bytes.
 * width:    number of source pixels per row.
 * height:   number of source rows.
 *
 * No bounds checking is done: for each pixel the code reads from one row
 * above to two rows below and from one column left to two columns right, so
 * that memory must be readable around the source area.
 */
void Super2xSaI_16(const byte* srcPtr, uint srcPitch, byte* dstPtr, uint dstPitch, int width, int height)
{
    const ushort* bP;
    ushort* dP;
    // Source row stride in ushort elements.
    const uint nextlineSrc = srcPitch >> 1;

    while (height--)
    {
        int i;
        bP = (const ushort*)srcPtr;
        dP = (ushort*)dstPtr;

        for (i = 0; i < width; ++i)
        {
            uint color4, color5, color6;
            uint color1, color2, color3;
            uint colorA0, colorA1, colorA2, colorA3;
            uint colorB0, colorB1, colorB2, colorB3;
            uint colorS1, colorS2;
            // product1a/1b: top-left/top-right output, product2a/2b: bottom-left/bottom-right.
            uint product1a, product1b, product2a, product2b;

            // Neighbourhood around the current pixel (5):
            //
            //     B0 B1 B2 B3      row above
            //      4  5  6 S2      current row
            //      1  2  3 S1      row below
            //     A0 A1 A2 A3      two rows below

            colorB0 = *(bP - nextlineSrc - 1);
            colorB1 = *(bP - nextlineSrc);
            colorB2 = *(bP - nextlineSrc + 1);
            colorB3 = *(bP - nextlineSrc + 2);

            color4 = *(bP - 1);
            color5 = *(bP);
            color6 = *(bP + 1);
            colorS2 = *(bP + 2);

            color1 = *(bP + nextlineSrc - 1);
            color2 = *(bP + nextlineSrc);
            color3 = *(bP + nextlineSrc + 1);
            colorS1 = *(bP + nextlineSrc + 2);

            colorA0 = *(bP + 2 * nextlineSrc - 1);
            colorA1 = *(bP + 2 * nextlineSrc);
            colorA2 = *(bP + 2 * nextlineSrc + 1);
            colorA3 = *(bP + 2 * nextlineSrc + 2);

            // --------------------------------------
            // Right-hand output column (product1b / product2b), decided from
            // the two diagonals 2-6 and 5-3.
            if (color2 == color6 && color5 != color3)
            {
                product2b = product1b = color2;
            }
            else if (color5 == color3 && color2 != color6)
            {
                product2b = product1b = color5;
            }
            else if (color5 == color3 && color2 == color6)
            {
                // Both diagonals are uniform: let the surrounding pixels vote
                // between color6 and color5.
                register int r = 0;

                r += GetResult(color6, color5, color1, colorA1);
                r += GetResult(color6, color5, color4, colorB1);
                r += GetResult(color6, color5, colorA2, colorS1);
                r += GetResult(color6, color5, colorB2, colorS2);

                if (r > 0)
                    product2b = product1b = color6;
                else if (r < 0)
                    product2b = product1b = color5;
                else
                {
                    // Tie: blend the two candidates.
                    product2b = product1b = INTERPOLATE(color5, color6);
                }
            }
            else
            {
                // Neither diagonal is uniform: pick a 3:1 or 1:1 blend for
                // the bottom-right pixel...
                if (color6 == color3 && color3 == colorA1 && color2 != colorA2 && color3 != colorA0)
                    product2b = Q_INTERPOLATE(color3, color3, color3, color2);
                else if (color5 == color2 && color2 == colorA2 && colorA1 != color3 && color2 != colorA3)
                    product2b = Q_INTERPOLATE(color2, color2, color2, color3);
                else
                    product2b = INTERPOLATE(color2, color3);

                // ...and for the top-right pixel.
                if (color6 == color3 && color6 == colorB1 && color5 != colorB2 && color6 != colorB0)
                    product1b = Q_INTERPOLATE(color6, color6, color6, color5);
                else if (color5 == color2 && color5 == colorB2 && colorB1 != color6 && color5 != colorB3)
                    product1b = Q_INTERPOLATE(color6, color5, color5, color5);
                else
                    product1b = INTERPOLATE(color5, color6);
            }

            // Bottom-left output pixel: color2, or a blend of color2 and color5.
            if (color5 == color3 && color2 != color6 && color4 == color5 && color5 != colorA2)
                product2a = INTERPOLATE(color2, color5);
            else if (color5 == color1 && color6 == color5 && color4 != color2 && color5 != colorA0)
                product2a = INTERPOLATE(color2, color5);
            else
                product2a = color2;

            // Top-left output pixel: color5, or a blend of color2 and color5.
            if (color2 == color6 && color5 != color3 && color1 == color2 && color2 != colorB2)
                product1a = INTERPOLATE(color2, color5);
            else if (color4 == color2 && color3 == color2 && color1 != color5 && color2 != colorB0)
                product1a = INTERPOLATE(color2, color5);
            else
                product1a = color5;

            // Store the 2x2 block; dstPitch / 2 is the destination row stride in ushort elements.
            *(dP + 0) = (ushort)product1a;
            *(dP + 1) = (ushort)product1b;
            *(dP + dstPitch / 2 + 0) = (ushort)product2a;
            *(dP + dstPitch / 2 + 1) = (ushort)product2b;

            bP += 1;
            dP += 2;
        }

        // One source row consumed, two destination rows produced.
        srcPtr += srcPitch;
        dstPtr += dstPitch * 2;
    }
}

/*
 * Super2xSaI scaler, 16-bit source to uint destination.
 *
 * Every source pixel is expanded into a 2x2 block. The filter works on the
 * 16-bit values and each result is translated through LUT16to32[] before it
 * is written.
 *
 * srcPtr:   first source pixel to scale; pixels are read as ushort.
 * srcPitch: distance between source rows in bytes.
 * dstPtr:   first destination pixel; pixels are written as uint.
 * dstPitch: distance between destination rows in bytes.
 * width:    number of source pixels per row.
 * height:   number of source rows.
 *
 * No bounds checking is done: for each pixel the code reads from one row
 * above to two rows below and from one column left to two columns right, so
 * that memory must be readable around the source area.
 */
void Super2xSaI_32(const byte* srcPtr, uint srcPitch, byte* dstPtr, uint dstPitch, int width, int height)
{
    const ushort* bP;
    uint* dP;
    // Source row stride in ushort elements.
    const uint nextlineSrc = srcPitch >> 1;

    while (height--)
    {
        int i;
        bP = (const ushort*)srcPtr;
        dP = (uint*)dstPtr;

        for (i = 0; i < width; ++i)
        {
            uint color4, color5, color6;
            uint color1, color2, color3;
            uint colorA0, colorA1, colorA2, colorA3;
            uint colorB0, colorB1, colorB2, colorB3;
            uint colorS1, colorS2;
            // product1a/1b: top-left/top-right output, product2a/2b: bottom-left/bottom-right.
            uint product1a, product1b, product2a, product2b;

            // Neighbourhood around the current pixel (5):
            //
            //     B0 B1 B2 B3      row above
            //      4  5  6 S2      current row
            //      1  2  3 S1      row below
            //     A0 A1 A2 A3      two rows below

            colorB0 = *(bP - nextlineSrc - 1);
            colorB1 = *(bP - nextlineSrc);
            colorB2 = *(bP - nextlineSrc + 1);
            colorB3 = *(bP - nextlineSrc + 2);

            color4 = *(bP - 1);
            color5 = *(bP);
            color6 = *(bP + 1);
            colorS2 = *(bP + 2);

            color1 = *(bP + nextlineSrc - 1);
            color2 = *(bP + nextlineSrc);
            color3 = *(bP + nextlineSrc + 1);
            colorS1 = *(bP + nextlineSrc + 2);

            colorA0 = *(bP + 2 * nextlineSrc - 1);
            colorA1 = *(bP + 2 * nextlineSrc);
            colorA2 = *(bP + 2 * nextlineSrc + 1);
            colorA3 = *(bP + 2 * nextlineSrc + 2);

            // --------------------------------------
            // Right-hand output column (product1b / product2b), decided from
            // the two diagonals 2-6 and 5-3.
            if (color2 == color6 && color5 != color3)
            {
                product2b = product1b = color2;
            }
            else if (color5 == color3 && color2 != color6)
            {
                product2b = product1b = color5;
            }
            else if (color5 == color3 && color2 == color6)
            {
                // Both diagonals are uniform: let the surrounding pixels vote
                // between color6 and color5.
                register int r = 0;

                r += GetResult(color6, color5, color1, colorA1);
                r += GetResult(color6, color5, color4, colorB1);
                r += GetResult(color6, color5, colorA2, colorS1);
                r += GetResult(color6, color5, colorB2, colorS2);

                if (r > 0)
                    product2b = product1b = color6;
                else if (r < 0)
                    product2b = product1b = color5;
                else
                {
                    // Tie: blend the two candidates.
                    product2b = product1b = INTERPOLATE(color5, color6);
                }
            }
            else
            {
                // Neither diagonal is uniform: pick a 3:1 or 1:1 blend for
                // the bottom-right pixel...
                if (color6 == color3 && color3 == colorA1 && color2 != colorA2 && color3 != colorA0)
                    product2b = Q_INTERPOLATE(color3, color3, color3, color2);
                else if (color5 == color2 && color2 == colorA2 && colorA1 != color3 && color2 != colorA3)
                    product2b = Q_INTERPOLATE(color2, color2, color2, color3);
                else
                    product2b = INTERPOLATE(color2, color3);

                // ...and for the top-right pixel.
                if (color6 == color3 && color6 == colorB1 && color5 != colorB2 && color6 != colorB0)
                    product1b = Q_INTERPOLATE(color6, color6, color6, color5);
                else if (color5 == color2 && color5 == colorB2 && colorB1 != color6 && color5 != colorB3)
                    product1b = Q_INTERPOLATE(color6, color5, color5, color5);
                else
                    product1b = INTERPOLATE(color5, color6);
            }

            // Bottom-left output pixel: color2, or a blend of color2 and color5.
            if (color5 == color3 && color2 != color6 && color4 == color5 && color5 != colorA2)
                product2a = INTERPOLATE(color2, color5);
            else if (color5 == color1 && color6 == color5 && color4 != color2 && color5 != colorA0)
                product2a = INTERPOLATE(color2, color5);
            else
                product2a = color2;

            // Top-left output pixel: color5, or a blend of color2 and color5.
            if (color2 == color6 && color5 != color3 && color1 == color2 && color2 != colorB2)
                product1a = INTERPOLATE(color2, color5);
            else if (color4 == color2 && color3 == color2 && color1 != color5 && color2 != colorB0)
                product1a = INTERPOLATE(color2, color5);
            else
                product1a = color5;

            // Translate each 16-bit result through the lookup table and store
            // the 2x2 block; dstPitch / 4 is the destination row stride in
            // uint elements.
            *(dP + 0) = LUT16to32[(ushort)product1a];
            *(dP + 1) = LUT16to32[(ushort)product1b];
            *(dP + dstPitch / 4 + 0) = LUT16to32[(ushort)product2a];
            *(dP + dstPitch / 4 + 1) = LUT16to32[(ushort)product2b];

            bP += 1;
            dP += 2;
        }

        // One source row consumed, two destination rows produced.
        srcPtr += srcPitch;
        dstPtr += dstPitch * 2;
    }
}
/*
 * SuperEagle scaler, 16-bit source to 16-bit destination.
 *
 * Every source pixel is expanded into a 2x2 block of destination pixels.
 *
 * srcPtr:   first source pixel to scale; pixels are read as ushort.
 * srcPitch: distance between source rows in bytes.
 * dstPtr:   first destination pixel; pixels are written as ushort.
 * dstPitch: distance between destination rows in bytes.
 * width:    number of source pixels per row.
 * height:   number of source rows.
 *
 * No bounds checking is done: for each pixel the code reads from one row
 * above to two rows below and from one column left to two columns right, so
 * that memory must be readable around the source area.
 */
void SuperEagle_16(const byte* srcPtr, uint srcPitch, byte* dstPtr, uint dstPitch, int width, int height)
{
    const ushort* bP;
    ushort* dP;
    // Source row stride in ushort elements.
    const uint nextlineSrc = srcPitch >> 1;

    while (height--)
    {
        int i;
        bP = (const ushort*)srcPtr;
        dP = (ushort*)dstPtr;
        for (i = 0; i < width; ++i)
        {
            uint color4, color5, color6;
            uint color1, color2, color3;
            uint colorA1, colorA2, colorB1, colorB2, colorS1, colorS2;
            // product1a/1b: top-left/top-right output, product2a/2b: bottom-left/bottom-right.
            uint product1a, product1b, product2a, product2b;

            // Neighbourhood around the current pixel (5):
            //
            //        B1 B2         row above
            //      4  5  6 S2      current row
            //      1  2  3 S1      row below
            //        A1 A2         two rows below

            colorB1 = *(bP - nextlineSrc);
            colorB2 = *(bP - nextlineSrc + 1);

            color4 = *(bP - 1);
            color5 = *(bP);
            color6 = *(bP + 1);
            colorS2 = *(bP + 2);

            color1 = *(bP + nextlineSrc - 1);
            color2 = *(bP + nextlineSrc);
            color3 = *(bP + nextlineSrc + 1);
            colorS1 = *(bP + nextlineSrc + 2);

            colorA1 = *(bP + 2 * nextlineSrc);
            colorA2 = *(bP + 2 * nextlineSrc + 1);

            //  --------------------------------------
            if (color5 != color3)
            {
                if (color2 == color6)
                {
                    // Only the 2-6 diagonal is uniform: it supplies the
                    // top-right and bottom-left pixels directly.
                    product1b = product2a = color2;
                    if ((color1 == color2) || (color6 == colorB2))
                    {
                        // Two successive 50/50 blends: 3 parts color2, 1 part color5.
                        product1a = INTERPOLATE(color2, color5);
                        product1a = INTERPOLATE(color2, product1a);
                    }
                    else
                    {
                        product1a = INTERPOLATE(color5, color6);
                    }

                    if ((color6 == colorS2) || (color2 == colorA1))
                    {
                        // Two successive 50/50 blends: 3 parts color2, 1 part color3.
                        product2b = INTERPOLATE(color2, color3);
                        product2b = INTERPOLATE(color2, product2b);
                    }
                    else
                    {
                        product2b = INTERPOLATE(color2, color3);
                    }
                }
                else
                {
                    // Neither diagonal is uniform: every output pixel is
                    // three parts its own source pixel and one part the
                    // blend of the opposite diagonal.
                    product2b = product1a = INTERPOLATE(color2, color6);
                    product2b = Q_INTERPOLATE(color3, color3, color3, product2b);
                    product1a = Q_INTERPOLATE(color5, color5, color5, product1a);

                    product2a = product1b = INTERPOLATE(color5, color3);
                    product2a = Q_INTERPOLATE(color2, color2, color2, product2a);
                    product1b = Q_INTERPOLATE(color6, color6, color6, product1b);
                }
            }
            else // color5 == color3
            {
                if (color2 != color6)
                {
                    // Only the 5-3 diagonal is uniform: it supplies the
                    // top-left and bottom-right pixels directly.
                    product2b = product1a = color5;

                    if ((colorB1 == color5) || (color3 == colorS1))
                    {
                        // Two successive 50/50 blends: 3 parts color5, 1 part color6.
                        product1b = INTERPOLATE(color5, color6);
                        product1b = INTERPOLATE(color5, product1b);
                    }
                    else
                    {
                        product1b = INTERPOLATE(color5, color6);
                    }

                    if ((color3 == colorA2) || (color4 == color5))
                    {
                        // Two successive 50/50 blends: 3 parts color5, 1 part color2.
                        product2a = INTERPOLATE(color5, color2);
                        product2a = INTERPOLATE(color5, product2a);
                    }
                    else
                    {
                        product2a = INTERPOLATE(color2, color3);
                    }
                }
                else // color2 == color6: both diagonals are uniform
                {
                    // Let the surrounding pixels vote between color6 and color5.
                    register int r = 0;

                    r += GetResult(color6, color5, color1, colorA1);
                    r += GetResult(color6, color5, color4, colorB1);
                    r += GetResult(color6, color5, colorA2, colorS1);
                    r += GetResult(color6, color5, colorB2, colorS2);

                    if (r > 0)
                    {
                        product1b = product2a = color2;
                        product1a = product2b = INTERPOLATE(color5, color6);
                    }
                    else if (r < 0)
                    {
                        product2b = product1a = color5;
                        product1b = product2a = INTERPOLATE(color5, color6);
                    }
                    else
                    {
                        // Tie: keep both diagonals unblended.
                        product2b = product1a = color5;
                        product1b = product2a = color2;
                    }
                }
            }

            // Store the 2x2 block; dstPitch / 2 is the destination row stride in ushort elements.
            *(dP + 0) = (ushort)product1a;
            *(dP + 1) = (ushort)product1b;
            *(dP + dstPitch / 2 + 0) = (ushort)product2a;
            *(dP + dstPitch / 2 + 1) = (ushort)product2b;

            bP += 1;
            dP += 2;
        }

        // One source row consumed, two destination rows produced.
        srcPtr += srcPitch;
        dstPtr += dstPitch * 2;
    }
}

/*
 * SuperEagle scaler, 16-bit source to uint destination.
 *
 * Every source pixel is expanded into a 2x2 block. The filter works on the
 * 16-bit values and each result is translated through LUT16to32[] before it
 * is written.
 *
 * srcPtr:   first source pixel to scale; pixels are read as ushort.
 * srcPitch: distance between source rows in bytes.
 * dstPtr:   first destination pixel; pixels are written as uint.
 * dstPitch: distance between destination rows in bytes.
 * width:    number of source pixels per row.
 * height:   number of source rows.
 *
 * No bounds checking is done: for each pixel the code reads from one row
 * above to two rows below and from one column left to two columns right, so
 * that memory must be readable around the source area.
 */
void SuperEagle_32(const byte* srcPtr, uint srcPitch, byte* dstPtr, uint dstPitch, int width, int height)
{
    const ushort* bP;
    uint* dP;
    // Source row stride in ushort elements.
    const uint nextlineSrc = srcPitch >> 1;

    while (height--)
    {
        int i;
        bP = (const ushort*)srcPtr;
        dP = (uint*)dstPtr;
        for (i = 0; i < width; ++i)
        {
            uint color4, color5, color6;
            uint color1, color2, color3;
            uint colorA1, colorA2, colorB1, colorB2, colorS1, colorS2;
            // product1a/1b: top-left/top-right output, product2a/2b: bottom-left/bottom-right.
            uint product1a, product1b, product2a, product2b;

            // Neighbourhood around the current pixel (5):
            //
            //        B1 B2         row above
            //      4  5  6 S2      current row
            //      1  2  3 S1      row below
            //        A1 A2         two rows below

            colorB1 = *(bP - nextlineSrc);
            colorB2 = *(bP - nextlineSrc + 1);

            color4 = *(bP - 1);
            color5 = *(bP);
            color6 = *(bP + 1);
            colorS2 = *(bP + 2);

            color1 = *(bP + nextlineSrc - 1);
            color2 = *(bP + nextlineSrc);
            color3 = *(bP + nextlineSrc + 1);
            colorS1 = *(bP + nextlineSrc + 2);

            colorA1 = *(bP + 2 * nextlineSrc);
            colorA2 = *(bP + 2 * nextlineSrc + 1);

            //  --------------------------------------
            if (color5 != color3)
            {
                if (color2 == color6)
                {
                    // Only the 2-6 diagonal is uniform: it supplies the
                    // top-right and bottom-left pixels directly.
                    product1b = product2a = color2;
                    if ((color1 == color2) || (color6 == colorB2))
                    {
                        // Two successive 50/50 blends: 3 parts color2, 1 part color5.
                        product1a = INTERPOLATE(color2, color5);
                        product1a = INTERPOLATE(color2, product1a);
                    }
                    else
                    {
                        product1a = INTERPOLATE(color5, color6);
                    }

                    if ((color6 == colorS2) || (color2 == colorA1))
                    {
                        // Two successive 50/50 blends: 3 parts color2, 1 part color3.
                        product2b = INTERPOLATE(color2, color3);
                        product2b = INTERPOLATE(color2, product2b);
                    }
                    else
                    {
                        product2b = INTERPOLATE(color2, color3);
                    }
                }
                else
                {
                    // Neither diagonal is uniform: every output pixel is
                    // three parts its own source pixel and one part the
                    // blend of the opposite diagonal.
                    product2b = product1a = INTERPOLATE(color2, color6);
                    product2b = Q_INTERPOLATE(color3, color3, color3, product2b);
                    product1a = Q_INTERPOLATE(color5, color5, color5, product1a);

                    product2a = product1b = INTERPOLATE(color5, color3);
                    product2a = Q_INTERPOLATE(color2, color2, color2, product2a);
                    product1b = Q_INTERPOLATE(color6, color6, color6, product1b);
                }
            }
            else // color5 == color3
            {
                if (color2 != color6)
                {
                    // Only the 5-3 diagonal is uniform: it supplies the
                    // top-left and bottom-right pixels directly.
                    product2b = product1a = color5;

                    if ((colorB1 == color5) || (color3 == colorS1))
                    {
                        // Two successive 50/50 blends: 3 parts color5, 1 part color6.
                        product1b = INTERPOLATE(color5, color6);
                        product1b = INTERPOLATE(color5, product1b);
                    }
                    else
                    {
                        product1b = INTERPOLATE(color5, color6);
                    }

                    if ((color3 == colorA2) || (color4 == color5))
                    {
                        // Two successive 50/50 blends: 3 parts color5, 1 part color2.
                        product2a = INTERPOLATE(color5, color2);
                        product2a = INTERPOLATE(color5, product2a);
                    }
                    else
                    {
                        product2a = INTERPOLATE(color2, color3);
                    }
                }
                else // color2 == color6: both diagonals are uniform
                {
                    // Let the surrounding pixels vote between color6 and color5.
                    register int r = 0;

                    r += GetResult(color6, color5, color1, colorA1);
                    r += GetResult(color6, color5, color4, colorB1);
                    r += GetResult(color6, color5, colorA2, colorS1);
                    r += GetResult(color6, color5, colorB2, colorS2);

                    if (r > 0)
                    {
                        product1b = product2a = color2;
                        product1a = product2b = INTERPOLATE(color5, color6);
                    }
                    else if (r < 0)
                    {
                        product2b = product1a = color5;
                        product1b = product2a = INTERPOLATE(color5, color6);
                    }
                    else
                    {
                        // Tie: keep both diagonals unblended.
                        product2b = product1a = color5;
                        product1b = product2a = color2;
                    }
                }
            }

            // Translate each 16-bit result through the lookup table and store
            // the 2x2 block; dstPitch / 4 is the destination row stride in
            // uint elements.
            *(dP + 0) = LUT16to32[(ushort)product1a];
            *(dP + 1) = LUT16to32[(ushort)product1b];
            *(dP + dstPitch / 4 + 0) = LUT16to32[(ushort)product2a];
            *(dP + dstPitch / 4 + 1) = LUT16to32[(ushort)product2b];

            bP += 1;
            dP += 2;
        }

        // One source row consumed, two destination rows produced.
        srcPtr += srcPitch;
        dstPtr += dstPitch * 2;
    }
}
/*
 * 2xSaI scaler, 16-bit source to 16-bit destination.
 *
 * Every source pixel is expanded into a 2x2 block of destination pixels.
 * The top-left pixel of the block is always the source pixel itself; the
 * other three are chosen or blended from its neighbours.
 *
 * srcPtr:   first source pixel to scale; pixels are read as ushort.
 * srcPitch: distance between source rows in bytes.
 * dstPtr:   first destination pixel; pixels are written as ushort.
 * dstPitch: distance between destination rows in bytes.
 * width:    number of source pixels per row.
 * height:   number of source rows.
 *
 * No bounds checking is done: for each pixel the code reads from one row
 * above to two rows below and from one column left to two columns right
 * (the far bottom-right corner excepted), so that memory must be readable
 * around the source area.
 */
void _2xSaI_16(const byte* srcPtr, uint srcPitch, byte* dstPtr, uint dstPitch, int width, int height)
{
    const ushort* bP;
    ushort* dP;
    // Row strides in ushort elements.
    const uint nextlineSrc = srcPitch >> 1;
    const uint nextlineDst = dstPitch / 2;

    while (height--)
    {
        int i;
        bP = (const ushort*)srcPtr;
        dP = (ushort*)dstPtr;

        for (i = 0; i < width; ++i)
        {
            uint colorA, colorB;
            uint colorC, colorD,
                colorE, colorF, colorG, colorH, colorI, colorJ, colorK, colorL, colorM, colorN, colorO;
            // product: top-right output, product1: bottom-left, product2: bottom-right.
            uint product, product1, product2;

            // ---------------------------------------
            //  Map of the pixels:                    I|E F|J
            //                                        G|A B|K
            //                                        H|C D|L
            //                                        M|N O|P
            //  A is the current pixel. P is not consulted by any rule below,
            //  so it is not loaded.
            colorI = *(bP - nextlineSrc - 1);
            colorE = *(bP - nextlineSrc);
            colorF = *(bP - nextlineSrc + 1);
            colorJ = *(bP - nextlineSrc + 2);

            colorG = *(bP - 1);
            colorA = *(bP);
            colorB = *(bP + 1);
            colorK = *(bP + 2);

            colorH = *(bP + nextlineSrc - 1);
            colorC = *(bP + nextlineSrc);
            colorD = *(bP + nextlineSrc + 1);
            colorL = *(bP + nextlineSrc + 2);

            colorM = *(bP + 2 * nextlineSrc - 1);
            colorN = *(bP + 2 * nextlineSrc);
            colorO = *(bP + 2 * nextlineSrc + 1);

            if ((colorA == colorD) && (colorB != colorC))
            {
                // Only the A-D diagonal is uniform.
                if (((colorA == colorE) && (colorB == colorL)) ||
                    ((colorA == colorC) && (colorA == colorF) && (colorB != colorE) && (colorB == colorJ)))
                {
                    product = colorA;
                }
                else
                {
                    product = INTERPOLATE(colorA, colorB);
                }

                if (((colorA == colorG) && (colorC == colorO)) ||
                    ((colorA == colorB) && (colorA == colorH) && (colorG != colorC) && (colorC == colorM)))
                {
                    product1 = colorA;
                }
                else
                {
                    product1 = INTERPOLATE(colorA, colorC);
                }
                product2 = colorA;
            }
            else if ((colorB == colorC) && (colorA != colorD))
            {
                // Only the B-C diagonal is uniform.
                if (((colorB == colorF) && (colorA == colorH)) ||
                    ((colorB == colorE) && (colorB == colorD) && (colorA != colorF) && (colorA == colorI)))
                {
                    product = colorB;
                }
                else
                {
                    product = INTERPOLATE(colorA, colorB);
                }

                if (((colorC == colorH) && (colorA == colorF)) ||
                    ((colorC == colorG) && (colorC == colorD) && (colorA != colorH) && (colorA == colorI)))
                {
                    product1 = colorC;
                }
                else
                {
                    product1 = INTERPOLATE(colorA, colorC);
                }
                product2 = colorB;
            }
            else if ((colorA == colorD) && (colorB == colorC))
            {
                // Both diagonals are uniform.
                if (colorA == colorB)
                {
                    // All four pixels are identical: plain replication.
                    product = colorA;
                    product1 = colorA;
                    product2 = colorA;
                }
                else
                {
                    // Let the surrounding pixels vote between A and B for the
                    // bottom-right pixel.
                    int r = 0;

                    product1 = INTERPOLATE(colorA, colorC);
                    product = INTERPOLATE(colorA, colorB);

                    r += GetResult(colorA, colorB, colorG, colorE);
                    r -= GetResult(colorB, colorA, colorK, colorF);
                    r -= GetResult(colorB, colorA, colorH, colorN);
                    r += GetResult(colorA, colorB, colorL, colorO);

                    if (r > 0)
                        product2 = colorA;
                    else if (r < 0)
                        product2 = colorB;
                    else
                    {
                        // Tie: average the four pixels.
                        product2 = Q_INTERPOLATE(colorA, colorB, colorC, colorD);
                    }
                }
            }
            else
            {
                // Neither diagonal is uniform: the bottom-right pixel is the
                // average of the 2x2 source group.
                product2 = Q_INTERPOLATE(colorA, colorB, colorC, colorD);

                if ((colorA == colorC) && (colorA == colorF)
                    && (colorB != colorE) && (colorB == colorJ))
                {
                    product = colorA;
                }
                else if ((colorB == colorE) && (colorB == colorD)
                    && (colorA != colorF) && (colorA == colorI))
                {
                    product = colorB;
                }
                else
                {
                    product = INTERPOLATE(colorA, colorB);
                }

                if ((colorA == colorB) && (colorA == colorH)
                    && (colorG != colorC) && (colorC == colorM))
                {
                    product1 = colorA;
                }
                else if ((colorC == colorG) && (colorC == colorD)
                    && (colorA != colorH) && (colorA == colorI))
                {
                    product1 = colorC;
                }
                else
                {
                    product1 = INTERPOLATE(colorA, colorC);
                }
            }

            // Store the 2x2 block.
            *(dP + 0) = (ushort)colorA;
            *(dP + 1) = (ushort)product;
            *(dP + nextlineDst + 0) = (ushort)product1;
            *(dP + nextlineDst + 1) = (ushort)product2;

            bP += 1;
            dP += 2;
        }

        // One source row consumed, two destination rows produced.
        srcPtr += srcPitch;
        dstPtr += dstPitch * 2;
    }
}

/*
 * 2xSaI scaler: 16-bit source pixels in, 32-bit destination pixels out.
 *
 * Every source pixel A produces a 2x2 destination block:
 *     top-left     = A
 *     top-right    = product
 *     bottom-left  = product1
 *     bottom-right = product2
 * The three computed values are picked or blended from A and its right (B),
 * lower (C) and lower-right (D) neighbours, steered by the surrounding
 * pixels.  All decisions are made on the 16-bit values; each result is
 * widened through LUT16to32[] only when it is stored.
 *
 * srcPtr   - first source pixel to scale (read as ushort).
 * srcPitch - distance between source rows in units of srcPtr; halved to get
 *            the stride in ushort pixels.
 * dstPtr   - first destination pixel (written as uint).
 * dstPitch - distance between destination rows in units of dstPtr; divided
 *            by 4 to get the stride in uint pixels.
 * width    - number of source pixels processed per row.
 * height   - number of source rows processed; values <= 0 process nothing.
 *
 * No bounds checks are made: for each pixel the code reads one row above,
 * two rows below, one column to the left and two columns to the right, so
 * the caller has to keep that border readable.
 */
void _2xSaI_32(const byte* srcPtr, uint srcPitch, byte* dstPtr, uint dstPitch, int width, int height)
{
    const ushort* bP;
    uint* dP;
    const uint nextlineSrc = srcPitch >> 1;

    /* Test "height > 0" instead of "height--" so a negative height cannot spin. */
    for (; height > 0; --height)
    {
        int i;
        bP = (const ushort*)srcPtr;
        dP = (uint*)dstPtr;

        for (i = 0; i < width; ++i)
        {
            uint colorA, colorB, colorC, colorD,
                colorE, colorF, colorG, colorH, colorI, colorJ, colorK, colorL, colorM, colorN, colorO;
            uint product, product1, product2;

            // ---------------------------------------
            //  Map of the pixels:                    I|E F|J
            //                                        G|A B|K
            //                                        H|C D|L
            //                                        M|N O|.
            //  (the bottom-right corner is never consulted, so it is not loaded)
            colorI = *(bP - nextlineSrc - 1);
            colorE = *(bP - nextlineSrc);
            colorF = *(bP - nextlineSrc + 1);
            colorJ = *(bP - nextlineSrc + 2);

            colorG = *(bP - 1);
            colorA = *(bP);
            colorB = *(bP + 1);
            colorK = *(bP + 2);

            colorH = *(bP + nextlineSrc - 1);
            colorC = *(bP + nextlineSrc);
            colorD = *(bP + nextlineSrc + 1);
            colorL = *(bP + nextlineSrc + 2);

            colorM = *(bP + 2 * nextlineSrc - 1);
            colorN = *(bP + 2 * nextlineSrc);
            colorO = *(bP + 2 * nextlineSrc + 1);

            if ((colorA == colorD) && (colorB != colorC))
            {
                /* Only the A-D diagonal matches: bottom-right follows A. */
                if (((colorA == colorE) && (colorB == colorL)) ||
                    ((colorA == colorC) && (colorA == colorF) && (colorB != colorE) && (colorB == colorJ)))
                {
                    product = colorA;
                }
                else
                {
                    product = INTERPOLATE(colorA, colorB);
                }

                if (((colorA == colorG) && (colorC == colorO)) ||
                    ((colorA == colorB) && (colorA == colorH) && (colorG != colorC) && (colorC == colorM)))
                {
                    product1 = colorA;
                }
                else
                {
                    product1 = INTERPOLATE(colorA, colorC);
                }
                product2 = colorA;
            }
            else if ((colorB == colorC) && (colorA != colorD))
            {
                /* Only the B-C diagonal matches: bottom-right follows B. */
                if (((colorB == colorF) && (colorA == colorH)) ||
                    ((colorB == colorE) && (colorB == colorD) && (colorA != colorF) && (colorA == colorI)))
                {
                    product = colorB;
                }
                else
                {
                    product = INTERPOLATE(colorA, colorB);
                }

                if (((colorC == colorH) && (colorA == colorF)) ||
                    ((colorC == colorG) && (colorC == colorD) && (colorA != colorH) && (colorA == colorI)))
                {
                    product1 = colorC;
                }
                else
                {
                    product1 = INTERPOLATE(colorA, colorC);
                }
                product2 = colorB;
            }
            else if ((colorA == colorD) && (colorB == colorC))
            {
                /* Both diagonals match. */
                if (colorA == colorB)
                {
                    /* All four pixels are equal: flat area, plain copy. */
                    product = colorA;
                    product1 = colorA;
                    product2 = colorA;
                }
                else
                {
                    /* Let the outer neighbours vote between A and B via GetResult(). */
                    int r = 0;

                    product1 = INTERPOLATE(colorA, colorC);
                    product = INTERPOLATE(colorA, colorB);

                    r += GetResult(colorA, colorB, colorG, colorE);
                    r -= GetResult(colorB, colorA, colorK, colorF);
                    r -= GetResult(colorB, colorA, colorH, colorN);
                    r += GetResult(colorA, colorB, colorL, colorO);

                    if (r > 0)
                        product2 = colorA;
                    else if (r < 0)
                        product2 = colorB;
                    else
                    {
                        /* Tie: blend all four pixels. */
                        product2 = Q_INTERPOLATE(colorA, colorB, colorC, colorD);
                    }
                }
            }
            else
            {
                /* Neither diagonal matches: bottom-right is the four-way blend. */
                product2 = Q_INTERPOLATE(colorA, colorB, colorC, colorD);

                if ((colorA == colorC) && (colorA == colorF)
                    && (colorB != colorE) && (colorB == colorJ))
                {
                    product = colorA;
                }
                else if ((colorB == colorE) && (colorB == colorD)
                    && (colorA != colorF) && (colorA == colorI))
                {
                    product = colorB;
                }
                else
                {
                    product = INTERPOLATE(colorA, colorB);
                }

                if ((colorA == colorB) && (colorA == colorH)
                    && (colorG != colorC) && (colorC == colorM))
                {
                    product1 = colorA;
                }
                else if ((colorC == colorG) && (colorC == colorD)
                    && (colorA != colorH) && (colorA == colorI))
                {
                    product1 = colorC;
                }
                else
                {
                    product1 = INTERPOLATE(colorA, colorC);
                }
            }

            /* Store the 2x2 block, widening each 16-bit value through the lookup table. */
            *(dP + 0) = LUT16to32[(ushort)colorA];
            *(dP + 1) = LUT16to32[(ushort)product];
            *(dP + dstPitch / 4 + 0) = LUT16to32[(ushort)product1];
            *(dP + dstPitch / 4 + 1) = LUT16to32[(ushort)product2];

            bP += 1;
            dP += 2;
        }

        /* One source row consumed, two destination rows produced. */
        srcPtr += srcPitch;
        dstPtr += dstPitch * 2;
    }
}

/*
 * This file is part of the Advance project.
 *
 * Copyright (C) 1999-2002 Andrea Mazzoleni
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * This file contains a C and MMX implementation of the Scale2x effect.
 *
 * You can find a high-level description of the effect at:
 *
 * http://scale2x.sourceforge.net/scale2x.html
 *
 * Alternatively at the previous license terms, you are allowed to use this
 * code in your program with these conditions:
 * - the program is not used in commercial activities.
 * - the whole source code of the program is released with the binary.
 * - derivative works of the program are allowed.
 */

#define MMX

/* Suggested in "Intel Optimization" for Pentium II */
#define ASM_JUMP_ALIGN ".p2align 4\n"

/*
 * Plain C Scale2x for one row of 16-bit pixels.
 *
 * Each pixel of src1 is expanded into a 2x2 block: two pixels in dst0 and
 * two pixels in dst1.  An output pixel takes the value of the vertical
 * neighbour (src0 for dst0, src2 for dst1) when that neighbour equals the
 * horizontal neighbour on the same side and differs from the opposite
 * vertical and horizontal neighbours; otherwise it copies src1.
 *
 * dst0, dst1 - output rows; 2 * count pixels are written to each.
 * src0       - neighbour row compared when producing dst0.
 * src1       - the row being scaled.
 * src2       - neighbour row compared when producing dst1.
 * count      - number of pixels in each source row.
 *
 * The first pixel never reads a left neighbour and the last pixel never
 * reads a right neighbour; the missing side simply copies src1.
 */
static void internal_scale2x_16_def(ushort* dst0, ushort* dst1, const ushort* src0, const ushort* src1, const ushort* src2, unsigned count)
{
    /*
     * The general path below needs at least two pixels: "count -= 2" on an
     * unsigned value would otherwise wrap and run far past the buffers.
     */
    if (count == 0)
        return;
    if (count == 1)
    {
        /* A lone pixel is both first and last: both edge rules copy src1. */
        dst0[0] = src1[0];
        dst0[1] = src1[0];
        dst1[0] = src1[0];
        dst1[1] = src1[0];
        return;
    }

    /* first pixel */
    dst0[0] = src1[0];
    dst1[0] = src1[0];
    if (src1[1] == src0[0] && src2[0] != src0[0])
        dst0[1] = src0[0];
    else
        dst0[1] = src1[0];
    if (src1[1] == src2[0] && src0[0] != src2[0])
        dst1[1] = src2[0];
    else
        dst1[1] = src1[0];
    ++src0;
    ++src1;
    ++src2;
    dst0 += 2;
    dst1 += 2;

    /* central pixels */
    count -= 2;
    while (count)
    {
        if (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0])
            dst0[0] = src0[0];
        else
            dst0[0] = src1[0];
        if (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0])
            dst0[1] = src0[0];
        else
            dst0[1] = src1[0];

        if (src1[-1] == src2[0] && src0[0] != src2[0] && src1[1] != src2[0])
            dst1[0] = src2[0];
        else
            dst1[0] = src1[0];
        if (src1[1] == src2[0] && src0[0] != src2[0] && src1[-1] != src2[0])
            dst1[1] = src2[0];
        else
            dst1[1] = src1[0];

        ++src0;
        ++src1;
        ++src2;
        dst0 += 2;
        dst1 += 2;
        --count;
    }

    /* last pixel */
    if (src1[-1] == src0[0] && src2[0] != src0[0])
        dst0[0] = src0[0];
    else
        dst0[0] = src1[0];
    if (src1[-1] == src2[0] && src0[0] != src2[0])
        dst1[0] = src2[0];
    else
        dst1[0] = src1[0];
    dst0[1] = src1[0];
    dst1[1] = src1[0];
}

/*
 * Plain C Scale2x for one row of 32-bit pixels.
 *
 * Each pixel of src1 is expanded into a 2x2 block: two pixels in dst0 and
 * two pixels in dst1.  An output pixel takes the value of the vertical
 * neighbour (src0 for dst0, src2 for dst1) when that neighbour equals the
 * horizontal neighbour on the same side and differs from the opposite
 * vertical and horizontal neighbours; otherwise it copies src1.
 *
 * dst0, dst1 - output rows; 2 * count pixels are written to each.
 * src0       - neighbour row compared when producing dst0.
 * src1       - the row being scaled.
 * src2       - neighbour row compared when producing dst1.
 * count      - number of pixels in each source row.
 *
 * The first pixel never reads a left neighbour and the last pixel never
 * reads a right neighbour; the missing side simply copies src1.
 */
static void internal_scale2x_32_def(uint* dst0,
    uint* dst1,
    const uint* src0,
    const uint* src1,
    const uint* src2,
    unsigned count)
{
    /*
     * The general path below needs at least two pixels: "count -= 2" on an
     * unsigned value would otherwise wrap and run far past the buffers.
     */
    if (count == 0)
        return;
    if (count == 1)
    {
        /* A lone pixel is both first and last: both edge rules copy src1. */
        dst0[0] = src1[0];
        dst0[1] = src1[0];
        dst1[0] = src1[0];
        dst1[1] = src1[0];
        return;
    }

    /* first pixel */
    dst0[0] = src1[0];
    dst1[0] = src1[0];
    if (src1[1] == src0[0] && src2[0] != src0[0])
        dst0[1] = src0[0];
    else
        dst0[1] = src1[0];
    if (src1[1] == src2[0] && src0[0] != src2[0])
        dst1[1] = src2[0];
    else
        dst1[1] = src1[0];
    ++src0;
    ++src1;
    ++src2;
    dst0 += 2;
    dst1 += 2;

    /* central pixels */
    count -= 2;
    while (count)
    {
        if (src1[-1] == src0[0] && src2[0] != src0[0] && src1[1] != src0[0])
            dst0[0] = src0[0];
        else
            dst0[0] = src1[0];
        if (src1[1] == src0[0] && src2[0] != src0[0] && src1[-1] != src0[0])
            dst0[1] = src0[0];
        else
            dst0[1] = src1[0];

        if (src1[-1] == src2[0] && src0[0] != src2[0] && src1[1] != src2[0])
            dst1[0] = src2[0];
        else
            dst1[0] = src1[0];
        if (src1[1] == src2[0] && src0[0] != src2[0] && src1[-1] != src2[0])
            dst1[1] = src2[0];
        else
            dst1[1] = src1[0];

        ++src0;
        ++src1;
        ++src2;
        dst0 += 2;
        dst1 += 2;
        --count;
    }

    /* last pixel */
    if (src1[-1] == src0[0] && src2[0] != src0[0])
        dst0[0] = src0[0];
    else
        dst0[0] = src1[0];
    if (src1[-1] == src2[0] && src0[0] != src2[0])
        dst1[0] = src2[0];
    else
        dst1[0] = src1[0];
    dst0[1] = src1[0];
    dst1[1] = src1[0];
}

// #if defined (MMX) && (defined (X86_ASSEMBLY) || defined (X86_MSVC_ASSEMBLY))
// static void internal_scale2x_16_mmx_single(ushort* dst, const ushort* src0, const ushort* src1, const ushort* src2, unsigned count) {
//     /* always do the first and last run */
//     count -= 2 * 4;
//
//     #ifdef __GNUC__
//     __asm__ __volatile__ (
//         /* first run */
//         /* set the current, current_pre, current_next registers */
//         "pxor %%mm0,%%mm0\n" /* use a fake black out of screen */
//         "movq 0(%1),%%mm7\n"
//         "movq 8(%1),%%mm1\n"
//         "psrlq $48,%%mm0\n"
//         "psllq $48,%%mm1\n"
//         "movq %%mm7,%%mm2\n"
//         "movq %%mm7,%%mm3\n"
//         "psllq $16,%%mm2\n"
//         "psrlq $16,%%mm3\n"
//         "por %%mm2,%%mm0\n"
//         "por %%mm3,%%mm1\n"
//
//         /* current_upper */
//         "movq (%0),%%mm6\n"
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "movq %%mm0,%%mm3\n"
//         "movq %%mm1,%%mm5\n"
//         "pcmpeqw %%mm6,%%mm2\n"
//         "pcmpeqw %%mm6,%%mm4\n"
//         "pcmpeqw (%2),%%mm3\n"
//         "pcmpeqw (%2),%%mm5\n"
//         "pandn %%mm2,%%mm3\n"
//         "pandn %%mm4,%%mm5\n"
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "pcmpeqw %%mm1,%%mm2\n"
//         "pcmpeqw %%mm0,%%mm4\n"
//         "pandn %%mm3,%%mm2\n"
//         "pandn %%mm5,%%mm4\n"
//         "movq %%mm2,%%mm3\n"
//         "movq %%mm4,%%mm5\n"
//         "pand %%mm6,%%mm2\n"
//         "pand %%mm6,%%mm4\n"
//         "pandn %%mm7,%%mm3\n"
//         "pandn %%mm7,%%mm5\n"
//         "por %%mm3,%%mm2\n"
//         "por %%mm5,%%mm4\n"
//
//         /* set *dst0 */
//         "movq %%mm2,%%mm3\n"
//         "punpcklwd %%mm4,%%mm2\n"
//         "punpckhwd %%mm4,%%mm3\n"
//         "movq %%mm2,(%3)\n"
//         "movq %%mm3,8(%3)\n"
//
//         /* next */
//         "addl $8,%0\n"
//         "addl $8,%1\n"
//         "addl $8,%2\n"
//         "addl $16,%3\n"
//
//         /* central runs */
//         "shrl $2,%4\n"
//         "jz 1f\n"
//         ASM_JUMP_ALIGN
//         "0:\n"
//
//         /* set the current, current_pre, current_next registers */
//         "movq -8(%1),%%mm0\n"
//         "movq (%1),%%mm7\n"
//         "movq 8(%1),%%mm1\n"
//         "psrlq $48,%%mm0\n"
//         "psllq $48,%%mm1\n"
//         "movq %%mm7,%%mm2\n"
//         "movq %%mm7,%%mm3\n"
//         "psllq $16,%%mm2\n"
//         "psrlq $16,%%mm3\n"
//         "por %%mm2,%%mm0\n"
//         "por %%mm3,%%mm1\n"
//
//         /* current_upper */
//         "movq (%0),%%mm6\n"
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "movq %%mm0,%%mm3\n"
//         "movq %%mm1,%%mm5\n"
//         "pcmpeqw %%mm6,%%mm2\n"
//         "pcmpeqw %%mm6,%%mm4\n"
//         "pcmpeqw (%2),%%mm3\n"
//         "pcmpeqw (%2),%%mm5\n"
//         "pandn %%mm2,%%mm3\n"
//         "pandn %%mm4,%%mm5\n"
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "pcmpeqw %%mm1,%%mm2\n"
//         "pcmpeqw %%mm0,%%mm4\n"
//         "pandn %%mm3,%%mm2\n"
//         "pandn %%mm5,%%mm4\n"
//         "movq %%mm2,%%mm3\n"
//         "movq %%mm4,%%mm5\n"
//         "pand %%mm6,%%mm2\n"
//         "pand %%mm6,%%mm4\n"
//         "pandn %%mm7,%%mm3\n"
//         "pandn %%mm7,%%mm5\n"
//         "por %%mm3,%%mm2\n"
//         "por %%mm5,%%mm4\n"
//
//         /* set *dst0 */
//         "movq %%mm2,%%mm3\n"
//         "punpcklwd %%mm4,%%mm2\n"
//         "punpckhwd %%mm4,%%mm3\n"
//         "movq %%mm2,(%3)\n"
//         "movq %%mm3,8(%3)\n"
//
//         /* next */
//         "addl $8,%0\n"
//         "addl $8,%1\n"
//         "addl $8,%2\n"
//         "addl $16,%3\n"
//
//         "decl %4\n"
//         "jnz 0b\n"
//         "1:\n"
//
//         /* final run */
//         /* set the current, current_pre, current_next registers */
//         "movq -8(%1),%%mm0\n"
//         "movq (%1),%%mm7\n"
//         "pxor %%mm1,%%mm1\n" /* use a fake black out of screen */
//         "psrlq $48,%%mm0\n"
//         "psllq $48,%%mm1\n"
//         "movq %%mm7,%%mm2\n"
//         "movq %%mm7,%%mm3\n"
//         "psllq $16,%%mm2\n"
//         "psrlq $16,%%mm3\n"
//         "por %%mm2,%%mm0\n"
//         "por %%mm3,%%mm1\n"
//
//         /* current_upper */
//         "movq (%0),%%mm6\n"
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "movq %%mm0,%%mm3\n"
//         "movq %%mm1,%%mm5\n"
//         "pcmpeqw %%mm6,%%mm2\n"
//         "pcmpeqw %%mm6,%%mm4\n"
//         "pcmpeqw (%2),%%mm3\n"
//         "pcmpeqw (%2),%%mm5\n"
//         "pandn %%mm2,%%mm3\n"
//         "pandn %%mm4,%%mm5\n"
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "pcmpeqw %%mm1,%%mm2\n"
//         "pcmpeqw %%mm0,%%mm4\n"
//         "pandn %%mm3,%%mm2\n"
//         "pandn %%mm5,%%mm4\n"
//         "movq %%mm2,%%mm3\n"
//         "movq %%mm4,%%mm5\n"
//         "pand %%mm6,%%mm2\n"
//         "pand %%mm6,%%mm4\n"
//         "pandn %%mm7,%%mm3\n"
//         "pandn %%mm7,%%mm5\n"
//         "por %%mm3,%%mm2\n"
//         "por %%mm5,%%mm4\n"
//
//         /* set *dst0 */
//         "movq %%mm2,%%mm3\n"
//         "punpcklwd %%mm4,%%mm2\n"
//         "punpckhwd %%mm4,%%mm3\n"
//         "movq %%mm2,(%3)\n"
//         "movq %%mm3,8(%3)\n"
//         "emms\n"
//
//         : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
//         :
//         : "cc"
//         );
//     #else
//     __asm {
//         mov eax, src0;
//         mov ebx, src1;
//         mov ecx, src2;
//         mov edx, dst;
//         mov esi, count;
//
//         /* first run */
//         /* set the current, current_pre, current_next registers */
//         pxor mm0, mm0; /* use a fake black out of screen */
//         movq mm7, qword ptr[ebx];
//         movq mm1, qword ptr[ebx + 8];
//         psrlq mm0, 48;
//         psllq mm1, 48;
//         movq mm2, mm7;
//         movq mm3, mm7;
//         psllq mm2, 16;
//         psrlq mm3, 16;
//         por mm0, mm2;
//         por mm1, mm3;
//
//         /* current_upper */
//         movq mm6, qword ptr[eax];
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         movq mm2, mm0;
//         movq mm4, mm1;
//         movq mm3, mm0;
//         movq mm5, mm1;
//         pcmpeqw mm2, mm6;
//         pcmpeqw mm4, mm6;
//         pcmpeqw mm3, qword ptr[ecx];
//         pcmpeqw mm5, qword ptr[ecx];
//         pandn mm3, mm2;
//         pandn mm5, mm4;
//         movq mm2, mm0;
//         movq mm4, mm1;
//         pcmpeqw mm2, mm1;
//         pcmpeqw mm4, mm0;
//         pandn mm2, mm3;
//         pandn mm4, mm5;
//         movq mm3, mm2;
//         movq mm5, mm4;
//         pand mm2, mm6;
//         pand mm4, mm6;
//         pandn mm3, mm7;
//         pandn mm5, mm7;
//         por mm2, mm3;
//         por mm4, mm5;
//
//         /* set *dst0 */
//         movq mm3, mm2;
//         punpcklwd mm2, mm4;
//         punpckhwd mm3, mm4;
//         movq qword ptr[edx], mm2;
//         movq qword ptr[edx + 8], mm3;
//
//         /* next */
//         add eax, 8;
//         add ebx, 8;
//         add ecx, 8;
//         add edx, 16;
//
//         /* central runs */
//         shr esi, 2;
//         jz label1;
//         align 4;
// label0:
//
//         /* set the current, current_pre, current_next registers */
//         movq mm0, qword ptr[ebx - 8];
//         movq mm7, qword ptr[ebx];
//         movq mm1, qword ptr[ebx + 8];
//         psrlq mm0, 48;
//         psllq mm1, 48;
//         movq mm2, mm7;
//         movq mm3, mm7;
//         psllq mm2, 16;
//         psrlq mm3, 16;
//         por mm0, mm2;
//         por mm1, mm3;
//
//         /* current_upper */
//         movq mm6, qword ptr[eax];
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         movq mm2, mm0;
//         movq mm4, mm1;
//         movq mm3, mm0;
//         movq mm5, mm1;
//         pcmpeqw mm2, mm6;
//         pcmpeqw mm4, mm6;
//         pcmpeqw mm3, qword ptr[ecx];
//         pcmpeqw mm5, qword ptr[ecx];
//         pandn mm3, mm2;
//         pandn mm5, mm4;
//         movq mm2, mm0;
//         movq mm4, mm1;
//         pcmpeqw mm2, mm1;
//         pcmpeqw mm4, mm0;
//         pandn mm2, mm3;
//         pandn mm4, mm5;
//         movq mm3, mm2;
//         movq mm5, mm4;
//         pand mm2, mm6;
//         pand mm4, mm6;
//         pandn mm3, mm7;
//         pandn mm5, mm7;
//         por mm2, mm3;
//         por mm4, mm5;
//
//         /* set *dst0 */
//         movq mm3, mm2;
//         punpcklwd mm2, mm4;
//         punpckhwd mm3, mm4;
//         movq qword ptr[edx], mm2;
//         movq qword ptr[edx + 8], mm3;
//
//         /* next */
//         add eax, 8;
//         add ebx, 8;
//         add ecx, 8;
//         add edx, 16;
//
//         dec esi;
//         jnz label0;
// label1:
//
//         /* final run */
//         /* set the current, current_pre, current_next registers */
//         movq mm0, qword ptr[ebx - 8];
//         movq mm7, qword ptr[ebx];
//         pxor mm1, mm1; /* use a fake black out of screen */
//         psrlq mm0, 48;
//         psllq mm1, 48;
//         movq mm2, mm7;
//         movq mm3, mm7;
//         psllq mm2, 16;
//         psrlq mm3, 16;
//         por mm0, mm2;
//         por mm1, mm3;
//
//         /* current_upper */
//         movq mm6, qword ptr[eax];
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         movq mm2, mm0;
//         movq mm4, mm1;
//         movq mm3, mm0;
//         movq mm5, mm1;
//         pcmpeqw mm2, mm6;
//         pcmpeqw mm4, mm6;
//         pcmpeqw mm3, qword ptr[ecx];
//         pcmpeqw mm5, qword ptr[ecx];
//         pandn mm3, mm2;
//         pandn mm5, mm4;
//         movq mm2, mm0;
//         movq mm4, mm1;
//         pcmpeqw mm2, mm1;
//         pcmpeqw mm4, mm0;
//         pandn mm2, mm3;
//         pandn mm4, mm5;
//         movq mm3, mm2;
//         movq mm5, mm4;
//         pand mm2, mm6;
//         pand mm4, mm6;
//         pandn mm3, mm7;
//         pandn mm5, mm7;
//         por mm2, mm3;
//         por mm4, mm5;
//
//         /* set *dst0 */
//         movq mm3, mm2;
//         punpcklwd mm2, mm4;
//         punpckhwd mm3, mm4;
//         movq qword ptr[edx], mm2;
//         movq qword ptr[edx + 8], mm3;
//
//         mov src0, eax;
//         mov src1, ebx;
//         mov src2, ecx;
//         mov dst, edx;
//         mov count, esi;
//
//         emms;
//     }
//     #endif
// }
//
// static void internal_scale2x_32_mmx_single(uint* dst, const uint* src0, const uint* src1, const uint* src2, unsigned count) {
//     /* always do the first and last run */
//     count -= 2 * 2;
//
//     #ifdef __GNUC__
//     __asm__ __volatile__ (
//         /* first run */
//         /* set the current, current_pre, current_next registers */
//         "pxor %%mm0,%%mm0\n" /* use a fake black out of screen */
//         "movq 0(%1),%%mm7\n"
//         "movq 8(%1),%%mm1\n"
//         "psrlq $32,%%mm0\n"
//         "psllq $32,%%mm1\n"
//         "movq %%mm7,%%mm2\n"
//         "movq %%mm7,%%mm3\n"
//         "psllq $32,%%mm2\n"
//         "psrlq $32,%%mm3\n"
//         "por %%mm2,%%mm0\n"
//         "por %%mm3,%%mm1\n"
//
//         /* current_upper */
//         "movq (%0),%%mm6\n"
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "movq %%mm0,%%mm3\n"
//         "movq %%mm1,%%mm5\n"
//         "pcmpeqd %%mm6,%%mm2\n"
//         "pcmpeqd %%mm6,%%mm4\n"
//         "pcmpeqd (%2),%%mm3\n"
//         "pcmpeqd (%2),%%mm5\n"
//         "pandn %%mm2,%%mm3\n"
//         "pandn %%mm4,%%mm5\n"
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "pcmpeqd %%mm1,%%mm2\n"
//         "pcmpeqd %%mm0,%%mm4\n"
//         "pandn %%mm3,%%mm2\n"
//         "pandn %%mm5,%%mm4\n"
//         "movq %%mm2,%%mm3\n"
//         "movq %%mm4,%%mm5\n"
//         "pand %%mm6,%%mm2\n"
//         "pand %%mm6,%%mm4\n"
//         "pandn %%mm7,%%mm3\n"
//         "pandn %%mm7,%%mm5\n"
//         "por %%mm3,%%mm2\n"
//         "por %%mm5,%%mm4\n"
//
//         /* set *dst0 */
//         "movq %%mm2,%%mm3\n"
//         "punpckldq %%mm4,%%mm2\n"
//         "punpckhdq %%mm4,%%mm3\n"
//         "movq %%mm2,(%3)\n"
//         "movq %%mm3,8(%3)\n"
//
//         /* next */
//         "addl $8,%0\n"
//         "addl $8,%1\n"
//         "addl $8,%2\n"
//         "addl $16,%3\n"
//
//         /* central runs */
//         "shrl $1,%4\n"
//         "jz 1f\n"
//         ASM_JUMP_ALIGN
//         "0:\n"
//
//         /* set the current, current_pre, current_next registers */
//         "movq -8(%1),%%mm0\n"
//         "movq (%1),%%mm7\n"
//         "movq 8(%1),%%mm1\n"
//         "psrlq $32,%%mm0\n"
//         "psllq $32,%%mm1\n"
//         "movq %%mm7,%%mm2\n"
//         "movq %%mm7,%%mm3\n"
//         "psllq $32,%%mm2\n"
//         "psrlq $32,%%mm3\n"
//         "por %%mm2,%%mm0\n"
//         "por %%mm3,%%mm1\n"
//
//         /* current_upper */
//         "movq (%0),%%mm6\n"
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "movq %%mm0,%%mm3\n"
//         "movq %%mm1,%%mm5\n"
//         "pcmpeqd %%mm6,%%mm2\n"
//         "pcmpeqd %%mm6,%%mm4\n"
//         "pcmpeqd (%2),%%mm3\n"
//         "pcmpeqd (%2),%%mm5\n"
//         "pandn %%mm2,%%mm3\n"
//         "pandn %%mm4,%%mm5\n"
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "pcmpeqd %%mm1,%%mm2\n"
//         "pcmpeqd %%mm0,%%mm4\n"
//         "pandn %%mm3,%%mm2\n"
//         "pandn %%mm5,%%mm4\n"
//         "movq %%mm2,%%mm3\n"
//         "movq %%mm4,%%mm5\n"
//         "pand %%mm6,%%mm2\n"
//         "pand %%mm6,%%mm4\n"
//         "pandn %%mm7,%%mm3\n"
//         "pandn %%mm7,%%mm5\n"
//         "por %%mm3,%%mm2\n"
//         "por %%mm5,%%mm4\n"
//
//         /* set *dst0 */
//         "movq %%mm2,%%mm3\n"
//         "punpckldq %%mm4,%%mm2\n"
//         "punpckhdq %%mm4,%%mm3\n"
//         "movq %%mm2,(%3)\n"
//         "movq %%mm3,8(%3)\n"
//
//         /* next */
//         "addl $8,%0\n"
//         "addl $8,%1\n"
//         "addl $8,%2\n"
//         "addl $16,%3\n"
//
//         "decl %4\n"
//         "jnz 0b\n"
//         "1:\n"
//
//         /* final run */
//         /* set the current, current_pre, current_next registers */
//         "movq -8(%1),%%mm0\n"
//         "movq (%1),%%mm7\n"
//         "pxor %%mm1,%%mm1\n" /* use a fake black out of screen */
//         "psrlq $32,%%mm0\n"
//         "psllq $32,%%mm1\n"
//         "movq %%mm7,%%mm2\n"
//         "movq %%mm7,%%mm3\n"
//         "psllq $32,%%mm2\n"
//         "psrlq $32,%%mm3\n"
//         "por %%mm2,%%mm0\n"
//         "por %%mm3,%%mm1\n"
//
//         /* current_upper */
//         "movq (%0),%%mm6\n"
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "movq %%mm0,%%mm3\n"
//         "movq %%mm1,%%mm5\n"
//         "pcmpeqd %%mm6,%%mm2\n"
//         "pcmpeqd %%mm6,%%mm4\n"
//         "pcmpeqd (%2),%%mm3\n"
//         "pcmpeqd (%2),%%mm5\n"
//         "pandn %%mm2,%%mm3\n"
//         "pandn %%mm4,%%mm5\n"
//         "movq %%mm0,%%mm2\n"
//         "movq %%mm1,%%mm4\n"
//         "pcmpeqd %%mm1,%%mm2\n"
//         "pcmpeqd %%mm0,%%mm4\n"
//         "pandn %%mm3,%%mm2\n"
//         "pandn %%mm5,%%mm4\n"
//         "movq %%mm2,%%mm3\n"
//         "movq %%mm4,%%mm5\n"
//         "pand %%mm6,%%mm2\n"
//         "pand %%mm6,%%mm4\n"
//         "pandn %%mm7,%%mm3\n"
//         "pandn %%mm7,%%mm5\n"
//         "por %%mm3,%%mm2\n"
//         "por %%mm5,%%mm4\n"
//
//         /* set *dst0 */
//         "movq %%mm2,%%mm3\n"
//         "punpckldq %%mm4,%%mm2\n"
//         "punpckhdq %%mm4,%%mm3\n"
//         "movq %%mm2,(%3)\n"
//         "movq %%mm3,8(%3)\n"
//         "emms\n"
//
//         : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
//         :
//         : "cc"
//         );
//     #else
//     __asm {
//         mov eax, src0;
//         mov ebx, src1;
//         mov ecx, src2;
//         mov edx, dst;
//         mov esi, count;
//
//         /* first run */
//         /* set the current, current_pre, current_next registers */
//         pxor mm0, mm0;
//         movq mm7, qword ptr[ebx];
//         movq mm1, qword ptr[ebx + 8];
//         psrlq mm0, 32;
//         psllq mm1, 32;
//         movq mm2, mm7;
//         movq mm3, mm7;
//         psllq mm2, 32;
//         psrlq mm3, 32;
//         por mm0, mm2;
//         por mm1, mm3;
//
//         /* current_upper */
//         movq mm6, qword ptr[eax];
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         movq mm2, mm0;
//         movq mm4, mm1;
//         movq mm3, mm0;
//         movq mm5, mm1;
//         pcmpeqd mm2, mm6;
//         pcmpeqd mm4, mm6;
//         pcmpeqd mm3, qword ptr[ecx];
//         pcmpeqd mm5, qword ptr[ecx];
//         pandn mm3, mm2;
//         pandn mm5, mm4;
//         movq mm2, mm0;
//         movq mm4, mm1;
//         pcmpeqd mm2, mm1;
//         pcmpeqd mm4, mm0;
//         pandn mm2, mm3;
//         pandn mm4, mm5;
//         movq mm3, mm2;
//         movq mm5, mm4;
//         pand mm2, mm6;
//         pand mm4, mm6;
//         pandn mm3, mm7;
//         pandn mm5, mm7;
//         por mm2, mm3;
//         por mm4, mm5;
//
//         /* set *dst0 */
//         movq mm3, mm2;
//         punpckldq mm2, mm4;
//         punpckhdq mm3, mm4;
//         movq qword ptr[edx], mm2;
//         movq qword ptr[edx + 8], mm3;
//
//         /* next */
//         add eax, 8;
//         add ebx, 8;
//         add ecx, 8;
//         add edx, 16;
//
//         /* central runs */
//         shr esi, 1;
//         jz label1;
// label0:
//
//         /* set the current, current_pre, current_next registers */
//         movq mm0, qword ptr[ebx - 8];
//         movq mm7, qword ptr[ebx];
//         movq mm1, qword ptr[ebx + 8];
//         psrlq mm0, 32;
//         psllq mm1, 32;
//         movq mm2, mm7;
//         movq mm3, mm7;
//         psllq mm2, 32;
//         psrlq mm3, 32;
//         por mm0, mm2;
//         por mm1, mm3;
//
//         /* current_upper */
//         movq mm6, qword ptr[eax];
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         movq mm2, mm0;
//         movq mm4, mm1;
//         movq mm3, mm0;
//         movq mm5, mm1;
//         pcmpeqd mm2, mm6;
//         pcmpeqd mm4, mm6;
//         pcmpeqd mm3, qword ptr[ecx];
//         pcmpeqd mm5, qword ptr[ecx];
//         pandn mm3, mm2;
//         pandn mm5, mm4;
//         movq mm2, mm0;
//         movq mm4, mm1;
//         pcmpeqd mm2, mm1;
//         pcmpeqd mm4, mm0;
//         pandn mm2, mm3;
//         pandn mm4, mm5;
//         movq mm3, mm2;
//         movq mm5, mm4;
//         pand mm2, mm6;
//         pand mm4, mm6;
//         pandn mm3, mm7;
//         pandn mm5, mm7;
//         por mm2, mm3;
//         por mm4, mm5;
//
//         /* set *dst0 */
//         movq mm3, mm2;
//         punpckldq mm2, mm4;
//         punpckhdq mm3, mm4;
//         movq qword ptr[edx], mm2;
//         movq qword ptr[edx + 8], mm3;
//
//         /* next */
//         add eax, 8;
//         add ebx, 8;
//         add ecx, 8;
//         add edx, 16;
//
//         dec esi;
//         jnz label0;
// label1:
//
//         /* final run */
//         /* set the current, current_pre, current_next registers */
//         movq mm0, qword ptr[ebx - 8];
//         movq mm7, qword ptr[ebx];
//         pxor mm1, mm1;
//         psrlq mm0, 32;
//         psllq mm1, 32;
//         movq mm2, mm7;
//         movq mm3, mm7;
//         psllq mm2, 32;
//         psrlq mm3, 32;
//         por mm0, mm2;
//         por mm1, mm3;
//
//         /* current_upper */
//         movq mm6, qword ptr[eax];
//
//         /* compute the upper-left pixel for dst0 on %%mm2 */
//         /* compute the upper-right pixel for dst0 on %%mm4 */
//         movq mm2, mm0;
//         movq mm4, mm1;
//         movq mm3, mm0;
//         movq mm5, mm1;
//         pcmpeqd mm2, mm6;
//         pcmpeqd mm4, mm6;
//         pcmpeqd mm3, qword ptr[ecx];
//         pcmpeqd mm5, qword ptr[ecx];
//         pandn mm3, mm2;
//         pandn mm5, mm4;
//         movq mm2, mm0;
//         movq mm4, mm1;
//         pcmpeqd mm2, mm1;
//         pcmpeqd mm4, mm0;
//         pandn mm2, mm3;
//         pandn mm4, mm5;
//         movq mm3, mm2;
//         movq mm5, mm4;
//         pand mm2, mm6;
//         pand mm4, mm6;
//         pandn mm3, mm7;
//         pandn mm5, mm7;
//         por mm2, mm3;
//         por mm4, mm5;
//
//         /* set *dst0 */
//         movq mm3, mm2;
//         punpckldq mm2, mm4;
//         punpckhdq mm3, mm4;
//         movq qword ptr[edx], mm2;
//         movq qword ptr[edx + 8], mm3;
//
//         mov src0, eax;
//         mov src1, ebx;
//         mov src2, ecx;
//         mov dst, edx;
//         mov count, esi;
//
//         emms;
//     }
//     #endif
// }
//
// static void internal_scale2x_16_mmx(ushort* dst0, ushort* dst1, const ushort* src0, const ushort* src1, const ushort* src2, unsigned count) {
//     //	ASSERT( count >= 2*4 );
//     internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
//     internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
// }
//
// static void internal_scale2x_32_mmx(uint* dst0, uint* dst1, const uint* src0, const uint* src1, const uint* src2, unsigned count) {
//     //	ASSERT( count >= 2*2 );
//     internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
//     internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
// }
// #endif

/*
 * Run the Scale2x row kernel (internal_scale2x_16_def) over an image whose
 * pixels are accessed as ushort, producing two destination rows for every
 * source row.
 *
 * srcPtr    first source row
 * srcPitch  distance between source rows; it is halved to obtain a stride in
 *           ushort units, so it is presumably given in bytes
 * dstPtr    first destination row
 * dstPitch  distance between destination rows, same convention as srcPitch
 * width     forwarded unchanged to the row kernel as its pixel count
 * height    number of source rows
 *
 * The kernel receives the previous, current and next source rows.  On the
 * first and last row the missing neighbour is replaced by the current row
 * itself.  A non-positive height is a no-op, and a single-row image uses the
 * current row for both neighbours.  Previously any height below 2 started
 * the row counter at a negative value, so the "while (count)" loop did not
 * stop at the end of the image and ran far outside both buffers.
 *
 * Only the portable C kernel is used here; the former MMX path is disabled.
 */
void AdMame2x(byte* srcPtr, uint srcPitch, /* byte deltaPtr, */
    byte* dstPtr, uint dstPitch, int width, int height)
{
    ushort* dst0 = (ushort*)dstPtr;
    ushort* dst1 = dst0 + (dstPitch / 2);
    ushort* src0 = (ushort*)srcPtr;
    ushort* src1;
    ushort* src2;
    int row;

    if (height <= 0)
    {
        return;
    }

    if (height == 1)
    {
        /* Single row: it serves as its own upper and lower neighbour. */
        internal_scale2x_16_def(dst0, dst1, src0, src0, src0, width);
        return;
    }

    src1 = src0 + (srcPitch / 2);
    src2 = src1 + (srcPitch / 2);

    /* First row: no row above, so the current row stands in for it. */
    internal_scale2x_16_def(dst0, dst1, src0, src0, src1, width);

    /* Interior rows 1 .. height - 2. */
    for (row = 2; row < height; ++row)
    {
        /* dstPitch ushort elements span two destination rows. */
        dst0 += dstPitch;
        dst1 += dstPitch;
        internal_scale2x_16_def(dst0, dst1, src0, src1, src2, width);

        /* Slide the three-row window down by one source row. */
        src0 = src1;
        src1 = src2;
        src2 += srcPitch / 2;
    }

    /* Last row: no row below, so the current row stands in for it. */
    dst0 += dstPitch;
    dst1 += dstPitch;
    internal_scale2x_16_def(dst0, dst1, src0, src1, src1, width);
}

/*
 * Run the Scale2x row kernel (internal_scale2x_32_def) over an image whose
 * pixels are accessed as uint, producing two destination rows for every
 * source row.
 *
 * srcPtr    first source row
 * srcPitch  distance between source rows; it is divided by 4 to obtain a
 *           stride in uint units, so it is presumably given in bytes
 * dstPtr    first destination row
 * dstPitch  distance between destination rows, same convention as srcPitch
 * width     forwarded unchanged to the row kernel as its pixel count
 * height    number of source rows
 *
 * The kernel receives the previous, current and next source rows.  On the
 * first and last row the missing neighbour is replaced by the current row
 * itself.  A non-positive height is a no-op, and a single-row image uses the
 * current row for both neighbours.  Previously any height below 2 started
 * the row counter at a negative value, so the "while (count)" loop did not
 * stop at the end of the image and ran far outside both buffers.
 *
 * Only the portable C kernel is used here; the former MMX path is disabled.
 */
void AdMame2x32(byte* srcPtr, uint srcPitch, /* byte deltaPtr, */
    byte* dstPtr, uint dstPitch, int width, int height)
{
    uint* dst0 = (uint*)dstPtr;
    uint* dst1 = dst0 + (dstPitch / 4);
    uint* src0 = (uint*)srcPtr;
    uint* src1;
    uint* src2;
    int row;

    if (height <= 0)
    {
        return;
    }

    if (height == 1)
    {
        /* Single row: it serves as its own upper and lower neighbour. */
        internal_scale2x_32_def(dst0, dst1, src0, src0, src0, width);
        return;
    }

    src1 = src0 + (srcPitch / 4);
    src2 = src1 + (srcPitch / 4);

    /* First row: no row above, so the current row stands in for it. */
    internal_scale2x_32_def(dst0, dst1, src0, src0, src1, width);

    /* Interior rows 1 .. height - 2. */
    for (row = 2; row < height; ++row)
    {
        /* dstPitch / 2 uint elements span two destination rows. */
        dst0 += dstPitch / 2;
        dst1 += dstPitch / 2;
        internal_scale2x_32_def(dst0, dst1, src0, src1, src2, width);

        /* Slide the three-row window down by one source row. */
        src0 = src1;
        src1 = src2;
        src2 += srcPitch / 4;
    }

    /* Last row: no row below, so the current row stands in for it. */
    dst0 += dstPitch / 2;
    dst1 += dstPitch / 2;
    internal_scale2x_32_def(dst0, dst1, src0, src1, src1, width);
}

/*
 * Reverse the byte order of a 32-bit value (despite the "16" in the name).
 * The lowest byte has to move up by 24 bits to land in the top byte; the
 * earlier shift of 16 placed the second-lowest byte there instead and lost
 * the lowest byte altogether.
 * The argument is expanded four times, so pass an expression without side
 * effects, and pass an unsigned value so the left shifts cannot overflow.
 */
#define swap16(x) ((((x) >> 24) & 0x000000ff) | (((x) >> 8) & 0x0000ff00) | (((x) << 8) & 0x00ff0000) | (((x) << 24) & 0xff000000))

/*
 * Fill the two 65536-entry lookup tables, LUT16to32 and RGBtoYUV.
 *
 * Every table index is treated as a packed pixel with 5 red bits (0xF800),
 * 6 green bits (0x07E0) and 5 blue bits (0x001F):
 *
 *   LUT16to32[p]  the three fields moved into bits 16-23, 8-15 and 0-7 of a
 *                 32-bit word, low-order bits of each byte left at zero.
 *   RGBtoYUV[p]   (Y << 16) + (u << 8) + v, computed from the fields widened
 *                 to 8 bits by a plain left shift.
 *
 * The bit-count and shift parameters are accepted for interface
 * compatibility but are not used: the tables are always built for the 5-6-5
 * layout.  Earlier code derived rmask/gmask/bmask from them without ever
 * reading the result; that dead computation (whose shifts are undefined for
 * out-of-range bit counts) has been removed.
 */
void hq_init(int rb, int gb, int bb, int rs, int gs, int bs)
{
    int pixel;

    (void)rb;
    (void)gb;
    (void)bb;
    (void)rs;
    (void)gs;
    (void)bs;

    for (pixel = 0; pixel < 65536; pixel++)
    {
        /* Colour fields widened to 8 bits (low bits zero). */
        const int r = ((pixel >> 11) & 0x1F) << 3;
        const int g = ((pixel >> 5) & 0x3F) << 2;
        const int b = (pixel & 0x1F) << 3;

        const int Y = (r + g + b) >> 2;
        /* The +128 bias keeps both chroma terms positive for all inputs. */
        const int u = 128 + ((r - b) >> 2);
        const int v = 128 + ((-r + 2 * g - b) >> 3);

        LUT16to32[pixel] = ((pixel & 0xF800) << 8) + ((pixel & 0x07E0) << 5) + ((pixel & 0x001F) << 3);
        RGBtoYUV[pixel] = (Y << 16) + (u << 8) + v;
    }
}

// }